source("./load_files.R")data_files_selected <-
list.files(
pattern = ".*carolnovoattila.*csv$",
recursive = TRUE,
ignore.case = TRUE
)
data_files_and_size <-
sapply(data_files_selected, file.size)
files_to_include_in_dataframe <-
tibble(
"Files" = names(data_files_and_size),
"Size (in MB)" = data_files_and_size/1E6
)
skimr::skim(files_to_include_in_dataframe)| Name | files_to_include_in_dataf… |
| Number of rows | 2 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Files | 0 | 1 | 101 | 103 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Size (in MB) | 0 | 1 | 39.85 | 35.83 | 14.52 | 27.19 | 39.85 | 52.52 | 65.19 | ▇▁▁▁▇ |
cdr <- load_cdr(names(data_files_and_size))
cdr %<>%
mutate(
expgroup = case_when(
str_detect(file, "carol") ~ "carol",
TRUE ~ "unknown"),
cycle = case_when(
str_detect(file, "R3") ~ "R3",
str_detect(file, "R4") ~ "R4",
TRUE ~ "unknown"),
time = case_when(
str_detect(file, "Initial") ~ "initial",
str_detect(file, "Final") ~ "final",
TRUE ~ "unknown")) %>%
select(cdr3, cycle, time, expgroup, everything())
cdr %<>%
group_by(cdr3, expgroup) %>%
arrange(cycle, desc(time), .by_group = TRUE) %>%
mutate(
fcp = cdrp / lag(cdrp, default = first(cdrp)),
fcq = quantity / lag(quantity, default = first(quantity))
) %>%
select(cdr3:quantity, fcp, fcq, everything())cdr %>%
filter(time == "final") %>%
group_by(expgroup, cycle, time) %>%
arrange(desc(fcp)) %>%
slice_head(prop = .1) %>%
# slice_head(n = 1000) %>%
ggplot(aes(expgroup, log10(fcp))) +
geom_violin(aes(fill = expgroup, color = expgroup), alpha = 0.5) +
geom_jitter(aes(shape = expgroup), alpha = 0.6, size = 1) +
stat_summary(
fun = mean,
fun.min = mean,
fun.max = mean,
geom = "crossbar",
# width = 0.5,
aes(color = expgroup)
) +
facet_grid(. ~ cycle)# cdr %>%
# filter(str_detect(cycle, "R0")) %>%
# filter(time == "final") %>%
# group_by(expgroup, cycle, time) %>%
# arrange(desc(fcp)) %>%
# slice_head(n = 1000) %>%
# ggplot(aes(fcp, color = expgroup, fill = expgroup)) +
# geom_density(stat = "bin", alpha = 0.3) +
# facet_grid(cycle ~ expgroup)
cdr %<>%
group_by(expgroup, cycle, time) %>%
arrange(desc(fcp)) %>%
slice_head(prop = .1) %>%
mutate(
threshold = mean(log10(fcp))
) %>%
mutate(
rich = if_else(
(log10(fcp) >= threshold) &
# (time == "final") &
# (str_detect(cycle, "R0")),
(time == "final"),
"rich",
"medium")) %>%
full_join(cdr) %>%
mutate(
rich = if_else(
is.na(rich),
"poor",
rich)) %>%
mutate(
rich = factor(rich,
levels = c("rich", "medium", "poor"))
) %>%
mutate(
threshold = if_else(
is.na(threshold),
0,
threshold
)
)
cdr %<>%
drop_na()
cdr %>%
filter(rich == "rich") %>%
ggplot() +
geom_violin(aes(expgroup, log10(fcp), fill = expgroup)) +
geom_jitter(aes(expgroup, log10(fcp), shape = expgroup), alpha = .2) +
facet_grid(rich ~ cycle)cdr %>%
filter(!rich == "rich") %>%
ggplot() +
geom_violin(aes(expgroup, log10(fcp), fill = expgroup)) +
geom_jitter(aes(expgroup, log10(fcp), shape = expgroup), alpha = .2) +
facet_grid(rich ~ cycle)names(cdr)## [1] "cdr3" "cycle" "time" "expgroup" "cdrp" "quantity"
## [7] "fcp" "fcq" "length" "MW" "AV" "IP"
## [13] "flex" "gravy" "SSF_Helix" "SSF_Turn" "SSF_Sheet" "n_A"
## [19] "n_C" "n_D" "n_E" "n_F" "n_G" "n_H"
## [25] "n_I" "n_K" "n_L" "n_M" "n_N" "n_P"
## [31] "n_Q" "n_R" "n_S" "n_T" "n_V" "n_W"
## [37] "n_Y" "aliphatic" "aromatic" "neutral" "positive" "negative"
## [43] "invalid" "file" "threshold" "rich"
# cdr %>%
# filter(rich == "rich") %>%
# ungroup() %>%
# select(cdr3:SSF_Sheet & !c("cycle", "time", "expgroup")) %>%
# ggplot() +
# geom_density(aes(AV))
#
# cdr %>%
# filter(rich == "poor") %>%
# ungroup() %>%
# select(cdr3:SSF_Sheet & !c("cycle", "time", "expgroup")) %>%
# ggplot() +
# geom_density(aes(AV))
cdr %>%
ggplot() +
geom_density(aes(AV, color = rich)) +
facet_grid(cycle ~ .)cdr %>%
ggplot() +
geom_density(aes(MW, color = rich)) +
facet_grid(cycle ~ .)cdr %>%
ggplot() +
geom_density(aes(gravy, color = rich)) +
facet_grid(cycle ~ .)cdr %>%
ggplot() +
geom_density(aes(SSF_Turn, color = rich)) +
facet_grid(cycle ~ .)cdr %>%
ggplot() +
geom_density(aes(IP, color = rich)) +
facet_grid(cycle ~ .)cdr %>%
ggplot() +
geom_density(aes(flex, color = rich)) +
facet_grid(cycle ~ .)cdr %>%
ggplot(aes(cycle, MW)) +
geom_violin(aes(fill = rich)) +
geom_jitter(alpha = .1) +
facet_grid(rich ~ .)cdr %>%
filter(rich == "rich") %>%
ungroup() %>%
select(cdr3:SSF_Sheet & !c("cycle", "time", "expgroup")) -> rich66
rich66## # A tibble: 2,985 x 14
## cdr3 cdrp quantity fcp fcq length MW AV IP flex gravy
## <chr> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 DNSW… 5.26e-4 791 385. 396. 8 1059. 0.375 4.05 0.735 -0.862
## 2 DRIT… 1.75e-2 26378 216. 222. 11 1222. 0.0909 5.84 0.734 -0.136
## 3 RMVR… 1.35e-4 203 197. 203 9 1108. 0.111 8.75 0.705 0.0667
## 4 GSSW… 1.32e-4 198 193. 198 9 1055. 0.222 6.00 0.784 -1.79
## 5 DESH… 6.19e-4 930 181. 186 14 1622. 0.214 4.05 0.773 -1.63
## 6 SEQR… 3.71e-4 557 181. 186. 10 1169. 0.1 4.05 0.8 -1.73
## 7 ARTF… 1.23e-4 185 180. 185 10 1207. 0.4 8.63 0.705 -0.570
## 8 VGRN 1.14e-4 171 166. 171 4 444. 0 9.72 0.767 -1.05
## 9 GEQW… 1.08e-4 163 158. 163 11 1435. 0.273 6.75 0.724 -0.854
## 10 ENDD… 1.07e-4 161 157. 161 9 1202. 0.333 4.05 0.754 -1.41
## # … with 2,975 more rows, and 3 more variables: SSF_Helix <dbl>,
## # SSF_Turn <dbl>, SSF_Sheet <dbl>
names(cdr)## [1] "cdr3" "cycle" "time" "expgroup" "cdrp" "quantity"
## [7] "fcp" "fcq" "length" "MW" "AV" "IP"
## [13] "flex" "gravy" "SSF_Helix" "SSF_Turn" "SSF_Sheet" "n_A"
## [19] "n_C" "n_D" "n_E" "n_F" "n_G" "n_H"
## [25] "n_I" "n_K" "n_L" "n_M" "n_N" "n_P"
## [31] "n_Q" "n_R" "n_S" "n_T" "n_V" "n_W"
## [37] "n_Y" "aliphatic" "aromatic" "neutral" "positive" "negative"
## [43] "invalid" "file" "threshold" "rich"
cdr %>%
filter(cycle == "R4") %>%
ggplot() +
geom_violin(aes(rich, gravy, color = rich)) +
geom_jitter(aes(rich, gravy), alpha = .1)library(Rtsne)
set.seed(42)
tsne_df <- cdr %>%
filter(time == "final") %>%
filter(cycle == "R4") %>%
group_by(rich) %>%
slice_sample(prop = .1)
tsne_df %>%
summarise(across(everything(), ~ all(sum(is.na(.x))))) %>%
select(where(isTRUE))## # A tibble: 3 x 0
tsne_df %>%
filter(is.na(threshold)) %>%
select(rich, threshold, quantity)## # A tibble: 0 x 3
## # Groups: rich [0]
## # … with 3 variables: rich <fct>, threshold <dbl>, quantity <int>
set.seed(42)
tsne_out <-
tsne_df %>%
ungroup() %>%
select(!where(is.character)) %>%
select(!c(which(apply(., 2, var)==0))) %>%
unique() %>%
Rtsne(
X = .,
dims = 3,
perplexity = 30,
theta = 0.5,
max_iter = 1E3,
verbose = T,
pca_center = T,
pca_scale = T,
# partial_pca = T,
normalize = T,
eta = 200.0,
exaggeration_factor = 12.0,
num_threads = parallel::detectCores() - 2
)## Performing PCA
## Read the 6567 x 42 data matrix successfully!
## OpenMP is working. 6 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 8.62 seconds (sparsity = 0.020275)!
## Learning embedding...
## Iteration 50: error is 91.821924 (50 iterations in 10.42 seconds)
## Iteration 100: error is 89.240834 (50 iterations in 9.70 seconds)
## Iteration 150: error is 89.089068 (50 iterations in 8.74 seconds)
## Iteration 200: error is 89.034821 (50 iterations in 9.42 seconds)
## Iteration 250: error is 89.004621 (50 iterations in 10.15 seconds)
## Iteration 300: error is 3.241007 (50 iterations in 8.25 seconds)
## Iteration 350: error is 2.882568 (50 iterations in 7.90 seconds)
## Iteration 400: error is 2.701125 (50 iterations in 8.52 seconds)
## Iteration 450: error is 2.586016 (50 iterations in 8.89 seconds)
## Iteration 500: error is 2.507843 (50 iterations in 9.19 seconds)
## Iteration 550: error is 2.451932 (50 iterations in 9.18 seconds)
## Iteration 600: error is 2.411179 (50 iterations in 9.30 seconds)
## Iteration 650: error is 2.381859 (50 iterations in 9.48 seconds)
## Iteration 700: error is 2.359850 (50 iterations in 9.87 seconds)
## Iteration 750: error is 2.343236 (50 iterations in 8.79 seconds)
## Iteration 800: error is 2.330313 (50 iterations in 9.07 seconds)
## Iteration 850: error is 2.320353 (50 iterations in 9.20 seconds)
## Iteration 900: error is 2.312231 (50 iterations in 9.65 seconds)
## Iteration 950: error is 2.305241 (50 iterations in 9.66 seconds)
## Iteration 1000: error is 2.299074 (50 iterations in 9.82 seconds)
## Fitting performed in 185.21 seconds.
tsne_df %>%
ungroup() %>%
select(!where(is.character)) %>%
select(!c(which(apply(., 2, var)==0))) %>%
unique() -> a
tsne_out %>%
.$Y %>%
as_tibble() %>%
ggplot() +
geom_point(aes(V1, V2, color = a$rich))tsne_out %>%
.$Y %>%
as_tibble() %>%
plot_ly(
title = "Sample title",
x = .$V1,
y = .$V2,
z = .$V3,
type = "scatter3d",
mode = "markers",
color = a$rich
) %>%
layout(title = "With Quantity Variables")library(Rtsne)
set.seed(42)
tsne_df <- cdr %>%
# filter(!rich == "poor") %>%
filter(time == "final") %>%
filter(cycle == "R4") %>%
group_by(rich) %>%
slice_sample(prop = .1)
tsne_df %>%
summarise(across(everything(), ~ all(sum(is.na(.x))))) %>%
select(where(isTRUE))## # A tibble: 3 x 0
tsne_df %>%
filter(is.na(threshold)) %>%
select(rich, threshold, quantity)## # A tibble: 0 x 3
## # Groups: rich [0]
## # … with 3 variables: rich <fct>, threshold <dbl>, quantity <int>
set.seed(42)
tsne_out <-
tsne_df %>%
ungroup() %>%
select(cdr3, !where(is.character)) %>%
select(!c(which(apply(., 2, var)==0))) %>%
select(!c("quantity", "cdrp", "fcp", "fcq", "rich", "threshold")) %>%
mutate(cdr3 = as.factor(cdr3)) %>%
Rtsne(
X = .,
dims = 3,
perplexity = 30,
theta = 0.5,
max_iter = 1E3,
verbose = T,
pca_center = T,
pca_scale = T,
normalize = T,
partial_pca = T,
eta = 200.0,
exaggeration_factor = 12.0,
num_threads = parallel::detectCores()
)## Performing PCA
## Read the 6575 x 50 data matrix successfully!
## OpenMP is working. 8 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 11.38 seconds (sparsity = 0.021765)!
## Learning embedding...
## Iteration 50: error is 90.625826 (50 iterations in 19.62 seconds)
## Iteration 100: error is 90.162903 (50 iterations in 23.18 seconds)
## Iteration 150: error is 90.132397 (50 iterations in 21.53 seconds)
## Iteration 200: error is 90.123344 (50 iterations in 21.65 seconds)
## Iteration 250: error is 90.119227 (50 iterations in 20.40 seconds)
## Iteration 300: error is 3.859909 (50 iterations in 17.87 seconds)
## Iteration 350: error is 3.456857 (50 iterations in 12.45 seconds)
## Iteration 400: error is 3.272986 (50 iterations in 14.23 seconds)
## Iteration 450: error is 3.161528 (50 iterations in 13.90 seconds)
## Iteration 500: error is 3.085411 (50 iterations in 13.63 seconds)
## Iteration 550: error is 3.031465 (50 iterations in 15.05 seconds)
## Iteration 600: error is 2.991372 (50 iterations in 14.24 seconds)
## Iteration 650: error is 2.960647 (50 iterations in 15.34 seconds)
## Iteration 700: error is 2.937113 (50 iterations in 14.57 seconds)
## Iteration 750: error is 2.918461 (50 iterations in 14.45 seconds)
## Iteration 800: error is 2.904167 (50 iterations in 15.21 seconds)
## Iteration 850: error is 2.892766 (50 iterations in 14.53 seconds)
## Iteration 900: error is 2.884979 (50 iterations in 15.84 seconds)
## Iteration 950: error is 2.878619 (50 iterations in 14.62 seconds)
## Iteration 1000: error is 2.873402 (50 iterations in 15.27 seconds)
## Fitting performed in 327.57 seconds.
tsne_df %>%
ungroup() %>%
select(cdr3, !where(is.character)) %>%
select(!c(which(apply(., 2, var)==0))) %>%
select(!c("quantity", "cdrp", "fcp", "fcq", "threshold")) -> a
tsne_out %>%
.$Y %>%
as_tibble() %>%
ggplot() +
geom_point(aes(V1, V2, color = a$rich))tsne_out %>%
.$Y %>%
as_tibble() %>%
plot_ly(
x = .$V1,
y = .$V2,
z = .$V3,
type = "scatter3d",
mode = "markers",
color = a$rich
) %>%
layout(title = "Without Quantity Variables")